home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
Tools
/
freeWAIS-sf-1.1
/
kidsf1.1.pl
< prev
next >
Wrap
Perl Script
|
1995-07-27
|
23KB
|
492 lines
#!/usr/skunk/bin/perl
# kidofwais.pl -- WAIS search interface
# (derived from sonofwais.pl, derived from wais.pl)
#
# a url of "http://your.www.server/cgibindir/kidofwais.pl" will search the
# default WAIS index; a url of the form:
# "http://your.www.server/cgibindir/kidofwais.pl/another_wais_src"
# will search the WAIS index called "another_wais_src".
# ----------- wais.pl and derivatives history:
# Tony Sanders <sanders@bsdi.com>, Nov 1993 Original author of wais.pl
#
# by Eric Lease Morgan, NCSU Libraries, April 1994 (sonofwais.pl)
# Modified to present the user "human-readable" titles, better instructions as
# well as the ability to do repeated searches after receiving results.
# eric_morgan@ncsu.edu
# http://dewey.lib.ncsu.edu/staff/morgan/morgan.html
# To read more about this script try:
# http://dewey.lib.ncsu.edu/staff/morgan/son-of-wais.html
# NOTE: the "headline" code only works right if you are using the "-t URL"
# option when you create the WAIS index using "waisindex"
# ---- Mike Grady, UIUC/CCSO, m-grady@uiuc.edu, http://ewshp2.cso.uiuc.edu/
# further modified by Mike Grady, UIUC/CCSO, June 1994 -- changed title routine
# to use some combination of original here and version from Sean Dowd, EDS
# (that version referred to as wais-ncsa-httpd.pl). Also used file typing idea
# from Sean's version and Kevin Hughes's (EIT) wwwwais.c.
# New: Added a hiliting option that works in conjunction with a second cgi
# script called "print_hit_bold.pl". It attempts to hilite the searchterms
# which occur in the document, and to place an "anchor" just before the
# first occurrence so that the document is positioned to there when it
# is displayed (at least this works with NCSA's HTTPD and Mosaic).
# Seeing the Encyclopedia Britannica's stuff got me to thinking how I
# might do this.
#
# also added support for Multitype (multiformat) query responses (using the
# option "-M TYPE1,TYPE2,..." when waisindexing). Waisindex blows up on me
# if I try to use the "-M" option AND the "-t URL" option. And if I tried
# adding a -T option (at least with TEXT), the "headline" ended up being just
# the filename without any path info. So don't use "-t" OR "-T" in conjunction
# with "-M" (the important thing is: make sure the headline field in the
# waisindex src ".cat" file has the form: "filename /path/to/file/" ).
# The alternate forms of the file available are listed following the "Score
# line. Marc Andreeson's "Mosaic And Wais Tutorial" discusses Multiformat
# indexing at "http://wintermute.ncsa.uiuc.edu:8080/wais-tutorial/wais.html".
# 6/29/94 -- Now have added in support for optional "title tables", both for
# indices themselves (used as title on "search" page), and for filetypes
# that don't lend themselves to extracting a nice title to display on the
# hit list (in my case, pdf type files).
# 6/30/94 -- Added support for searching multiple indices with one search. This
# uses a "Source table" which serves as both a title table for the indices (as
# mentioned above), and as a "process control table". See the comments in the
# sample table for more description. This is all "turned on" by setting the
# the flag $use_Source_table to "1"; if you set it to "0", no table will be
# expected or used.
# 7/20/94 -- Few minor changes in search "numbers" display: removed "lines"
# since it is often meaningless, changed "bytes" to be rounded to nearest kbyte.
# Small changes to <TITLE> processing in &extractTitle.
# 8/4/94 -- fixed a bug with multi-index code, added DEBUG option
# 8/15/94 -- fixed a bug: occasionally WAIS returns two "odd" files that are
# information from index.src and index.cat -- exclude these files from list
# do we need to debug the returned lines from waisq? If so, write into log.
$DEBUG = 0; # set to 1 to turn on debugging code; set next to logfile
$debugLOG = "/usr/httpd/logs/kidsf.log";
# where is your waiq binary?
$waisq = "/usr/local/bin/waisqsf1.1";
# where are your source files?
$waisd = "/usr/wais/wais-sources";
# what database do you want to search? (Default)
$default_src = "gruntsco";
# what is the opening title you want to present to users (default)
$openingTitle = "Search the gruntsco sf HTML archives";
# after searching, what do you want the title to be? (default)
$closingTitle = "Search results from the gruntsco sf HTML archives";
# Use a table to look up title (and other info) to use on Search page?
# If 1, then should supply filename for table.
$use_Source_table = 1;
$Source_table = "/usr/wais/wais-sources/Source_table";
# Specify the directory where your WWW docs reside -- same as in "kidofwais.pl"
# (this is the same path you subtracted when you waisindexed (using -t url))
$wwwDocpath = "/b/httpd/doc/";
# Specify the url for this WWW server -- same as in "kidofwais.pl" script
# (this is the same string you added when you waisindexed (using -t url))
$serverURL = "http://gruntsco.pdev.sco.com/";
# maximum number of hits to return
$max_hits = 40;
# Should we use the "highlighting script" for filetypes that "make sense" for
# it? (1 is yes, 0 is no)
$use_hilite = 0;
# specify the www url to the "highlighting script" (used for .html and .txt)
# whether this is used is controlled by $use_hilite flag.
$hilite_script = "http://gruntsco.pdev.sco.com/cgi-bin/print_hit_bold.pl/";
# specify the "first hit anchor" to be used with hiliting
$anchor = "#first_hit";
# who maintains this service?
#$maintainer = "<A HREF=http://ewshp2.cso.uiuc.edu/index.html>Michael A. Grady</A> (m-grady@uiuc.edu)";
$maintainer = "<A HREF=http://gruntsco.pdev.sco.com/author/hiramc.html>Hiram Clawson (hiramc@sco.COM) x7519</A>";
# and when was it last modified.
$modified = "12 May, 1995";
# you shouldn't have to edit anything below this line, except if you want to change the help text
sub extractTitle {
# try and get the <title> ... </title> field from file
# only try to find it in the first 5 lines, and then give up
local($fl) = @_;
local($intitle) = 0;
local($title) = "$theFile: No title, please let $maintainer know.";
local($linenum) = 1;
local($_);
# read the file and extract the title
# this is a combination of code from Eric Lease Morgan and Sean Dowd
open (FP, "$fl") || return "File $theFile can't be read. Please contact
$maintainer.";
while (<FP>) {
chop;
last if ($linenum > 5);
$linenum ++;
if (/<TITLE\s?>(.*)<\/TITLE\s?>/i) { # all on one line
$title = $1;
last;
}
elsif (/<TITLE\s?>(.*)$/i) { # on multiple lines
$title = $1;
$intitle = 1;
}
elsif (/^(.*)<\/TITLE\s?>/i) { # finish of multiple lines
$title = "$title$1";
$intitle = 0;
last;
}
elsif ($intitle) { # add to title, and keep going
$title = "$title$_";
}
}
close (FP);
$title =~ s/^\s*//; # remove whitespace at front
$title =~ s/\s*$//; # remove whitespace at end
$title = "$theFile: Empty title, please let $maintainer know." unless $title;
return $title;
} # end sub extractTitle
sub extractTableTitle { # Get file titles from table. Add a call to this
# routine for any filetype that you decide to create titles for.
# Currently, only the PDF type is set up to call this. (See the
# subroutine &type_file). If nothing is found, it returns the current
# $doc_title unchanged (which is the filename relative to doc root).
# Table read into array at first reference to it.
local ($fl) = @_;
return $doc_title if ($file_title_table eq "");
local($_,$name_to_find,$table_entry,$filename,$filetitle);
# Change the next line to "$name_to_find = $theFile" if you want to
# use the whole path relative to doc root as the name to lookup in the
# table. The current code uses just the filename w/o path info.
$name_to_find = substr($fl, rindex($fl, '/') + 1); # "basename"
if ($current_file_title_array ne $file_title_table) { # Read in new
# table to array
undef %title_array; # erase current array
$current_file_title_array = $file_title_table;
open (TABLE_TITLE, $file_title_table) || return $doc_title;
while ($table_entry = <TABLE_TITLE>) {
chop;
next if $table_entry =~ /^\s*#/; # skip comments
next if $table_entry =~ /^\s*$/; # skip blank lines
($filename, $filetitle) = split(/~/, $table_entry, 2);
$title_array{$filename} = $filetitle;
}
close (TABLE_TITLE);
}
if ($filetitle = $title_array{$name_to_find}) { return $filetitle;}
else { return $doc_title;}
}
sub send_index {
print "Content-type: text/html\n\n";
print "<HEAD>\n<TITLE>$openingTitle</TITLE>\n<ISINDEX></HEAD>\n";
print "<BODY>\n<H2>", $openingTitle, "</H2>\n";
print "<p>";
print "This is an index of some of the information on this server. ";
print "To use this function, simply enter a query. ";
print "Since this is a WAIS index, you can enter complex queries. For example:<p>";
print "<DT><B>Right-hand truncation</B> (stemming) queries";
print "<DD>The query 'astro*' will find documents containing the words";
print " 'astronomy' as well as 'astrophysics'.<p>";
print "<DT>Boolean '<B>And</B>' queries";
print "<DD>The query 'red and blue' will find the <B>intersection</B> of all";
print " the documents containing the words 'red', and 'blue'.";
print "The use of 'and' limits your retrieval.<p>";
print "<DT>Boolean '<B>Or</B>' queries";
print "<DD>The query 'red or blue' will find the <B>union</B> of all the";
print " documents containing the words 'red' and 'blue'.";
print "The use of 'or' increases your retrieval.<p>";
print "<DT>Boolean '<B>Not</B>' queries";
print "<DD>The query 'red not green' will find all the documents containing";
print " the word 'red', and <B>excluding</B> the documents containing the word 'green'.";
print "The use of 'not' limits your retrieval.<p>";
print "<DT><B>Nested</B> Boolean queries";
print "<DD>The query '(red and green) or blue not pink' will find the union of all";
print " the documents containing the words 'red', and 'green'. It will then add (union)";
print " all documents containing the word 'blue'. Finally, it will exclude all documents";
print " containing the word 'pink'";
print "<HR>";
print "This page is maintained by $maintainer, and it was last modified on $modified.<p>\n";
print "</BODY>\n";
}
sub type_file {
# Set file type based on file extension; also has the "side effect" of
# modifying the $url_to_use if $use_hilite flag is turned on and the
# file type is appropriate to use it.
# You can add other types if you want very easily. Also, if you have
# created a "file_title_table" and want it to be accessed for a
# particular filetype, just add "$doc_title = &extractTableTitle ($_);"
# to the appropriate do{} structure below. See the "pdf" file type
# below for an example.
local($filename) = @_;
local($type) = "";
local($_);
SUFFIX: for ($filename) {
/\.html$/i && do {
$type = "HTML file";
$doc_title = &extractTitle ($the_full_File);
if ($use_hilite) {
$url_to_use = $hilite_script . $filename .
"?" . $query_plus . $anchor;
}
last SUFFIX;
};
/\.te?xt$/i && do {
$type = "text file";
if ($use_hilite) {
$url_to_use = $hilite_script . $filename .
"?" . $query_plus . $anchor;
}
last SUFFIX;
};
/\.gif$/i && do {
$type = "GIF graphic";
last SUFFIX;
};
/\.ps$/i && do {
$type = "PostScript file";
last SUFFIX;
};
/\.pdf$/i && do {
$type = "Acrobat PDF file";
# try to get a better file title
$doc_title = &extractTableTitle ($_);
last SUFFIX;
};
/\.jpg$/i && do {
$type = "JPEG graphic";
last SUFFIX;
};
/\.mpg$/i && do {
$type = "MPEG movie";
last SUFFIX;
};
/\.Z$/i && do {
$type = "compressed file";
last SUFFIX;
};
/\.gz$/i && do {
$type = "compressed file";
last SUFFIX;
};
/\.au$/i && do {
$type = "Sun audio file";
last SUFFIX;
};
/\.hqx$/i && do {
$type = "Binhex file";
last SUFFIX;
};
/\.tar$/i && do {
$type = "tar'red file";
last SUFFIX;
};
$type = "Unknown type"; # "fall thru" default case
} # end suffix
return $type;
} # end sub type_file
sub byscores { $scores[$b] <=> $scores[$a];} # descending numeric sort routine
sub print_it { # Print out the hit list, or if multiple sources, save all
# the info to be printed in a array (where all the info for
# "one" hit becomes one array element), only to be printed
# once all sources are searched. This is so we can sort the
# combined hit lists into descending order by score.
local($line) = @_;
if ($src_multiple) {
if ($line eq "</DL>\n") { # All sources have been searched, we
# can now sort and print the entire
# hit list array.
print @hit_list[ sort byscores $[..$#hit_list ];
print "$line";
} elsif ($line eq "</DD>\n") { # End of one "hit" listing,
$output_hit .= $line; # save in the hit array.
push ( @hit_list, $output_hit );
push ( @scores, $score );
$output_hit = "";
} else { # more stuff for the same hit
$output_hit .= $line;
}
} else {
print "$line";
}
}
sub do_wais {
$src = $default_src;
# if 'PATH_INFO' has a non-null value, then use it as the name of the
# WAIS source to search, otherwise will default to $default_src.
$path_extension = $ENV{'PATH_INFO'};
if ($path_extension =~ /^\/(.+)$/) { $src = $1; }
@Sources = ( $src ); # Initialize array of sources to search
if ( $use_Source_table ) { # Read in the Source table info into
# associative array.
open (INDEX_TITLES, $Source_table) || last;
while (<INDEX_TITLES>) {
chop;
next if /^\s*#/; # skip comments
next if /^\s*$/; # skip blank lines
($src_name, $remainder) = split(/~/, $_, 2);
$src_array{$src_name} = $remainder;
}
close (INDEX_TITLES);
($src_title, $src_multiple, $src_prefix, $file_title_table, $sources) =
split(/~/, $src_array{$src});
if ($src_title ne "") {
$openingTitle = "Search of $src_title";
$closingTitle = "Search results from $src_title";
}
$src_multiple && (@Sources = split(/,/, $sources)); # Store the sources
} # to be searched.
do { &send_index; return; } unless defined @ARGV; # No search terms yet.
local(@query) = @ARGV;
local($pquery) = join(" ", @query);
# NCSA's HTTPD puts backslashes in front of "funny" or "dangerous"
# characters in the input supplied thru argv. In the case of search terms
# for WAIS, this can screw up the search (parens and "*" get backslashed
# and then don't work correctly). So remove the backslashes, AND the
# potentially "dangerous" characters ( ; ` ! ).
$pquery =~ tr/!\;\`\\//d; # just in case, get rid of ;`! and \
@query = split(' ',$pquery); # and recreate query word array
$query_plus = join("+", @query);
print "Content-type: text/html\n\n"; # Start the "html" doc to be returned
print "<HEAD>\n<TITLE>$closingTitle</TITLE>\n<ISINDEX></HEAD>\n";
print "<BODY>\n<H2>", $closingTitle, "</H2>\n";
print "Index <B>\`$src\'</B> contains the following\n";
print "items relevant to <B>\`$pquery\'</B>:<p>\n";
print "<DL>\n";
local($hits, $score, $headline, $lines, $bytes, $type, @types, $date);
$DEBUG && do { open (LOG, ">>$debugLOG") || die "can't open log";};
foreach $src (@Sources) { # Search each indicated index for the terms
($src_prefix, $file_title_table) = (split(/~/, $src_array{$src}))[2,3];
open(WAISQ, "-|") || exec ($waisq, "-c", $waisd, "-m", $max_hits,
"-f", "-", "-S", "$src.src", "-g", @query);
while (<WAISQ>) {
$DEBUG && print LOG $_;
/:score\s+(\d+)/ && ($score = $1);
/:number-of-lines\s+(\d+)/ && ($lines = $1);
/:number-of-bytes\s+(\d+)/ && ($bytes = $1);
/:type "(.*)"/ && (push (@types, $1));
/:headline "(.*)"/ && ($headline = $1);
/:date "(\d+)"/ && ($date = $1, $hits++, &docdone);
}
close(WAISQ);
$total_hits += $hits;
$hits = 0;
}
&print_it ("</DL>\n"); # signal to print out hit array if we've been
# building it (for multiple sources).
print "<HR>";
print "This page is maintained by $maintainer.<p>";
if ($total_hits == 0) { print "Nothing found that matches your query.\n"; }
print "</BODY>\n"; # End the "html" doc being returned
}
sub docdone { # Called for each "hit" returned by waisq
local($endfile,$path_to_file,$file_proper,$file_ext,$multi_type,$alt_count);
if ($headline =~ /Search produced no result/) {
if ($src_multiple) { # don't print source listing if
$hits--; # multi-index search
}
else {
print "</DL><HR>";
print $headline, "<p>\n<PRE>";
local ($count) = 0;
open(WAISCAT, "$waisd/$src.cat") || die "$src.cat: $!";
while (<WAISCAT>) {
last if ($count > 30); # limit to about 30 filenames
$count ++;
s#(Catalog for database:)\s+.*#$1 <B>$src.src</B> -- some of the files are:#;
s#Headline:\s+(.*)#Headline: <A HREF="$1">$1</A>#;
print;
}
close(WAISCAT);
print "\n</PRE>\n";
}
} elsif (($headline =~ /^Information on database:/) ||
($headline =~ /^Catalog for database:/)) {
$hits--;
} else { # this is a "real" hit
($endfile, $path_to_file) = split(' ', $headline);
if ($path_to_file ne "") { # Not indexed with -t url, so headline of
# form: "filename /path/to/file/"
# Multitype indexed files will probably have this form, as at
# least I can't get "-t url" to coexist with "-M type,type"
$the_full_File = $theFile = $path_to_file.$endfile;
$theFile =~ s/^.*$wwwDocpath//i;
$url_to_use = $serverURL.$theFile;
} else { # should have been indexed as "-t url", so headline of form
# http://your_server_url/path/to/actual/file
# get the string to munge
$theFile = $url_to_use = $headline;
# parse out the file name (remove the server URL from the front)
$theFile =~ s/^.*$serverURL//i;
# concatenate the "wwwDocpath" variable with the file name
$the_full_File = $wwwDocpath.$theFile;
}
$last_period = rindex($theFile, "."); # need filename without .ext if
if ($last_period > 0) { # it turns out to be multitype
$file_proper = substr($theFile, 0, $last_period);
$file_ext = substr($theFile, $last_period + 1);
}
$doc_title = $theFile;
$type = &type_file ($theFile); # also modifies $url_to_use if flag
# $use_hilite is set and right filetype
$src_multiple && ($doc_title = "$src_prefix $doc_title");
if ($bytes < 1000) { $calc_bytes = "< 1 Kbyte"; }
else { $calc_bytes = int(($bytes + 500)/1000) . " Kbytes"; }
&print_it ("<DT><A HREF=\"$url_to_use\">$doc_title</A></DT>\n");
&print_it ("<DD>Relevancy score: $score , Size: <B>$calc_bytes</B> , Type: $type\n");
if (($#types > 0) && ($file_proper ne "")) { # Multitype indexing
$alt_count = 0; # offer alternatives
foreach $multi_type (@types) {
next if $multi_type eq $file_ext; # skip, already listed
&print_it ("</DD><DD>... Alternate Types Available: ") if $alt_count == 0;
$alt_count ++;
$theFile = "$file_proper.$multi_type";
$url_to_use = $serverURL.$theFile;
$type = &type_file ($theFile);
&print_it ("<A HREF=\"$url_to_use\">$type</A>, ");
} # end foreach $multi_type
}
&print_it ("</DD>\n");
}
$score = $headline = $lines = $bytes = $type = $date = '';
$file_proper = $file_ext = '';
@types = ();
}
open (STDERR,"> /dev/null");
eval '&do_wais';